"""
对数据进行分词和保存
"""
from tqdm import tqdm  # tqdm(可迭代对象)，可以打印进度条
import pickle  # 模型的保存和加载所用包
from dataset import tokens
from word_sequence import WordSequence as WS
import os


def collect_text_files(directories):
    """Return the paths of all ``.txt`` files directly inside *directories*.

    Args:
        directories: Iterable of directory paths to scan (non-recursive).

    Returns:
        A list of ``os.path.join(directory, name)`` paths. Directories are
        visited in the order given; within one directory the order is
        whatever ``os.listdir`` returns.

    Raises:
        OSError: If one of the directories cannot be listed.
    """
    text_files = []
    for directory in directories:
        # Accumulate across directories; rebinding the list here would keep
        # only the files of the last directory.
        text_files.extend(
            os.path.join(directory, name)
            for name in os.listdir(directory)
            if name.endswith(".txt")
        )
    return text_files


if __name__ == '__main__':
    # 1. Collect the paths of all training files (positive and negative).
    base_file_path = r"F:\virtual_environment\data\aclImdb_v1\aclImdb"
    file_paths = [os.path.join(base_file_path, "train/pos"), os.path.join(base_file_path, "train/neg")]
    data_file_paths = collect_text_files(file_paths)

    # 2. Read every file and tokenize its content.
    ws = WS()
    for data_file_path in tqdm(data_file_paths):
        # Close each file promptly; there are thousands of them.
        with open(data_file_path, encoding="UTF-8") as data_file:
            sentences = tokens(data_file.read())

        # 2.1 Feed the tokens into the word-frequency statistics.
        ws.fit(sentences)

    # 3. Build the dictionary from the collected statistics.
    ws.build_dict(min_rate=10, max_word=10000)

    # 4. Save the dictionary model.
    model_path = "./model/ws.pkl"
    # Make sure the target directory exists so the dump cannot fail after
    # all the processing above has already been done.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    with open(model_path, mode="wb") as model_file:
        pickle.dump(ws, model_file)
    print(len(ws))




